Skip to content

Conversation

@arsenm
Copy link
Contributor

@arsenm arsenm commented Aug 11, 2025

This should address the case where the result isn't fully used,
resulting in partial copy bundles from the MFMA result.

@llvmbot
Copy link
Member

llvmbot commented Aug 11, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

This should address the case where the result isn't fully used,
resulting in partial copy bundles from the MFMA result.


Patch is 56.45 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/153019.diff

6 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp (+45-61)
  • (modified) llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir (-91)
  • (modified) llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir (+89-8)
  • (modified) llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir (+18-18)
  • (modified) llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir (+38-38)
  • (modified) llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll (+36-62)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index a8dfdbe5dd494..5206f32ec99e5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -14,8 +14,6 @@
 /// MFMA opcode.
 ///
 /// TODO:
-///  - Handle SplitKit partial copy bundles, and not just full copy instructions
-///
 ///  - Update LiveIntervals incrementally instead of recomputing from scratch
 ///
 //===----------------------------------------------------------------------===//
@@ -37,6 +35,7 @@ using namespace llvm;
 namespace {
 
 class AMDGPURewriteAGPRCopyMFMAImpl {
+  MachineFunction &MF;
   const GCNSubtarget &ST;
   const SIInstrInfo &TII;
   const SIRegisterInfo &TRI;
@@ -53,7 +52,7 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
   AMDGPURewriteAGPRCopyMFMAImpl(MachineFunction &MF, VirtRegMap &VRM,
                                 LiveRegMatrix &LRM, LiveIntervals &LIS,
                                 const RegisterClassInfo &RegClassInfo)
-      : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
+      : MF(MF), ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
         TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM),
         LIS(LIS), RegClassInfo(RegClassInfo) {}
 
@@ -71,26 +70,26 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
   ///
   /// \p RewriteRegs will accumulate the set of register used by those MFMAs
   /// that need to have the register classes adjusted.
-  const TargetRegisterClass *recomputeRegClassExceptRewritable(
-      Register Reg, const TargetRegisterClass *OldRC,
-      const TargetRegisterClass *NewRC,
-      SmallVectorImpl<MachineInstr *> &RewriteCandidates,
+  bool recomputeRegClassExceptRewritable(
+      Register Reg, SmallVectorImpl<MachineInstr *> &RewriteCandidates,
       SmallSetVector<Register, 4> &RewriteRegs) const;
 
   bool run(MachineFunction &MF) const;
 };
 
-const TargetRegisterClass *
-AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
-    Register StartReg, const TargetRegisterClass *OldRC,
-    const TargetRegisterClass *NewRC,
-    SmallVectorImpl<MachineInstr *> &RewriteCandidates,
+bool AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
+    Register StartReg, SmallVectorImpl<MachineInstr *> &RewriteCandidates,
     SmallSetVector<Register, 4> &RewriteRegs) const {
   SmallVector<Register, 8> Worklist = {StartReg};
 
   // Recursively visit all transitive MFMA users
   while (!Worklist.empty()) {
     Register Reg = Worklist.pop_back_val();
+    const TargetRegisterClass *OldRC = MRI.getRegClass(Reg);
+
+    // Inflate to the equivalent AV_* class.
+    const TargetRegisterClass *NewRC = TRI.getLargestLegalSuperClass(OldRC, MF);
+
     // Accumulate constraints from all uses.
     for (MachineOperand &MO : MRI.reg_nodbg_operands(Reg)) {
       // Apply the effect of the given operand to NewRC.
@@ -101,23 +100,40 @@ AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
       // either AGPR or VGPR in src0/src1, so don't bother checking the
       // constraint effects of the individual operands.
       if (isRewriteCandidate(*MI)) {
-        for (AMDGPU::OpName OpName :
-             {AMDGPU::OpName::vdst, AMDGPU::OpName::src2}) {
-          const MachineOperand *Op = TII.getNamedOperand(*MI, OpName);
+        const MachineOperand *VDst =
+            TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
+        const MachineOperand *Src2 =
+            TII.getNamedOperand(*MI, AMDGPU::OpName::src2);
+        for (const MachineOperand *Op : {VDst, Src2}) {
           if (!Op->isReg())
             continue;
 
           Register OtherReg = Op->getReg();
-          if (OtherReg != Reg) {
-            if (RewriteRegs.insert(OtherReg))
-              Worklist.push_back(OtherReg);
-          }
+          if (OtherReg.isPhysical())
+            return false;
+
+          if (OtherReg != Reg && RewriteRegs.insert(OtherReg))
+            Worklist.push_back(OtherReg);
         }
 
-        LLVM_DEBUG(dbgs() << "Ignoring effects of " << *MI);
+        if (!is_contained(RewriteCandidates, MI)) {
+          LLVM_DEBUG({
+            Register VDstPhysReg = VRM.getPhys(VDst->getReg());
+            dbgs() << "Attempting to replace VGPR MFMA with AGPR version:"
+                   << " Dst=[" << printReg(VDst->getReg()) << " => "
+                   << printReg(VDstPhysReg, &TRI);
+
+            if (Src2->isReg()) {
+              Register Src2PhysReg = VRM.getPhys(Src2->getReg());
+              dbgs() << ", Src2=[" << printReg(Src2->getReg(), &TRI) << " => "
+                     << printReg(Src2PhysReg, &TRI);
+            }
+
+            dbgs() << "]: " << MI;
+          });
 
-        if (!is_contained(RewriteCandidates, MI))
           RewriteCandidates.push_back(MI);
+        }
 
         continue;
       }
@@ -126,13 +142,14 @@ AMDGPURewriteAGPRCopyMFMAImpl::recomputeRegClassExceptRewritable(
       NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, &TII, &TRI);
       if (!NewRC || NewRC == OldRC) {
         LLVM_DEBUG(dbgs() << "User of " << printReg(Reg, &TRI)
-                          << " cannot be reassigned to AGPR: " << *MI);
-        return nullptr;
+                          << " cannot be reassigned to "
+                          << TRI.getRegClassName(NewRC) << ": " << *MI);
+        return false;
       }
     }
   }
 
-  return NewRC;
+  return true;
 }
 
 /// Attempt to reassign the registers in \p InterferingRegs to be AGPRs, with a
@@ -225,10 +242,7 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
 
     for (VNInfo *VNI : LI.vnis()) {
       MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def);
-
-      // TODO: Handle SplitKit produced copy bundles for partially defined
-      // registers.
-      if (!DefMI || !DefMI->isFullCopy())
+      if (!DefMI || !DefMI->isCopy())
         continue;
 
       Register MFMADstReg = DefMI->getOperand(1).getReg();
@@ -241,34 +255,6 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
       if (!MFMA || !isRewriteCandidate(*MFMA))
         continue;
 
-      MachineOperand *Src2 = TII.getNamedOperand(*MFMA, AMDGPU::OpName::src2);
-      Register Src2Reg;
-      if (Src2->isReg()) {
-        Src2Reg = Src2->getReg();
-        if (!Src2Reg.isVirtual())
-          continue;
-      }
-
-      // FIXME: getMinimalPhysRegClass returns a nonsense AV_* subclass instead
-      // of an AGPR or VGPR subclass, so we can't simply use the result on the
-      // assignment.
-
-      LLVM_DEBUG({
-        dbgs() << "Attempting to replace VGPR MFMA with AGPR version:"
-               << " Dst=[" << printReg(VReg) << " => "
-               << printReg(PhysReg, &TRI);
-
-        if (Src2Reg) {
-          Register Src2PhysReg = VRM.getPhys(Src2Reg);
-          dbgs() << ", Src2=[" << printReg(Src2Reg, &TRI) << " => "
-                 << printReg(Src2PhysReg, &TRI);
-        }
-
-        dbgs() << "]: " << *MFMA;
-      });
-
-      const TargetRegisterClass *DstVirtRegRC = MRI.getRegClass(MFMADstReg);
-
       // src2 and dst have the same physical class constraint; try to preserve
       // the original src2 subclass if one were to exist.
       SmallVector<MachineInstr *, 4> RewriteCandidates = {MFMA};
@@ -287,11 +273,9 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const {
       //
       // Note recomputeRegClassExceptRewritable will consider the constraints of
       // this MFMA's src2 as well as the src2/dst of any transitive MFMA users.
-      const TargetRegisterClass *DstExceptRC =
-          recomputeRegClassExceptRewritable(MFMADstReg, DstVirtRegRC, VirtRegRC,
-                                            RewriteCandidates, RewriteRegs);
-      if (!DstExceptRC) {
-        LLVM_DEBUG(dbgs() << "Could not recompute the regclass of "
+      if (!recomputeRegClassExceptRewritable(MFMADstReg, RewriteCandidates,
+                                             RewriteRegs)) {
+        LLVM_DEBUG(dbgs() << "Could not recompute the regclass of dst reg "
                           << printReg(MFMADstReg, &TRI) << '\n');
         continue;
       }
diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir
index 3103d635200c6..45c185b6154ea 100644
--- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-agpr-negative-tests.mir
@@ -20,10 +20,6 @@
     ret void
   }
 
-  define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg() #0 {
-    ret void
-  }
-
   define amdgpu_kernel void @inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_first() #1 {
     ret void
   }
@@ -420,93 +416,6 @@ body:             |
 
 ...
 
-# Non-mac variant, src2 is the same VGPR, but a different subregister.
----
-name:            inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg
-tracksRegLiveness: true
-machineFunctionInfo:
-  isEntryFunction: true
-  stackPtrOffsetReg: '$sgpr32'
-  occupancy:       10
-  sgprForEXECCopy: '$sgpr100_sgpr101'
-body:             |
-  ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg
-  ; CHECK: bb.0:
-  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   S_NOP 0, implicit-def $agpr0
-  ; CHECK-NEXT:   renamable $sgpr0 = S_MOV_B32 0
-  ; CHECK-NEXT:   renamable $vgpr8 = V_MOV_B32_e32 0, implicit $exec
-  ; CHECK-NEXT:   renamable $sgpr1 = COPY renamable $sgpr0
-  ; CHECK-NEXT:   renamable $vgpr18_vgpr19 = COPY killed renamable $sgpr0_sgpr1
-  ; CHECK-NEXT:   renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
-  ; CHECK-NEXT:   dead renamable $vgpr9 = COPY renamable $vgpr8
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.1:
-  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
-  ; CHECK-NEXT:   liveins: $vcc, $vgpr18_vgpr19
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
-  ; CHECK-NEXT:   renamable $vgpr16_vgpr17 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1)
-  ; CHECK-NEXT:   early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr18_vgpr19, $vgpr18_vgpr19, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-  ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.1, implicit $vcc
-  ; CHECK-NEXT:   S_BRANCH %bb.2
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT: bb.2:
-  ; CHECK-NEXT:   liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31:0x00000000FFFFFFFF
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-  ; CHECK-NEXT:   S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-  ; CHECK-NEXT:   S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
-  ; CHECK-NEXT:   S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-  ; CHECK-NEXT:   S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
-  ; CHECK-NEXT:   S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
-  ; CHECK-NEXT:   S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
-  ; CHECK-NEXT:   S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
-  ; CHECK-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-  ; CHECK-NEXT:   GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr8_agpr9_agpr10_agpr11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
-  ; CHECK-NEXT:   GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr12_agpr13_agpr14_agpr15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
-  ; CHECK-NEXT:   GLOBAL_STORE_DWORDX4_SADDR renamable $vgpr0, renamable $agpr0_agpr1_agpr2_agpr3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
-  ; CHECK-NEXT:   GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr0, killed renamable $agpr4_agpr5_agpr6_agpr7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
-  ; CHECK-NEXT:   S_ENDPGM 0
-  bb.0:
-    S_NOP 0, implicit-def $agpr0
-    renamable $sgpr0 = S_MOV_B32 0
-    undef %0.sub8:vreg_1024_align2 = V_MOV_B32_e32 0, implicit $exec
-    renamable $sgpr1 = COPY renamable $sgpr0
-    %1:vreg_64_align2 = COPY killed renamable $sgpr0_sgpr1
-    renamable $vcc = S_AND_B64 $exec, -1, implicit-def dead $scc
-    %0.sub9:vreg_1024_align2 = COPY %0.sub8
-
-  bb.1:
-    liveins: $vcc
-
-    undef %0.sub0_sub1:vreg_1024_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
-    %0.sub16_sub17:vreg_1024_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1)
-    %0.sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15:vreg_1024_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0.sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23_sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31, 0, 0, 0, implicit $mode, implicit $exec
-    S_CBRANCH_VCCNZ %bb.1, implicit $vcc
-    S_BRANCH %bb.2
-
-  bb.2:
-    ; No VGPRs available for %0
-    S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-    S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
-    S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-    S_NOP 0, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
-    S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
-    S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
-    S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
-    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub8_sub9_sub10_sub11, undef $sgpr0_sgpr1, 32, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
-    GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub12_sub13_sub14_sub15, undef $sgpr0_sgpr1, 48, 0, implicit $exec :: (store (s128), addrspace 1)
-    GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub0_sub1_sub2_sub3, undef $sgpr0_sgpr1, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
-    GLOBAL_STORE_DWORDX4_SADDR %2, %0.sub4_sub5_sub6_sub7, killed undef $sgpr0_sgpr1, 16, 0, implicit $exec :: (store (s128), addrspace 1)
-    S_ENDPGM 0
-
-...
-
 # There isn't an assignable AGPR around the first MFMA.
 ---
 name:            inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_chain_no_agprs_first
diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
index 3de86da766af7..735c7e081b21a 100644
--- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
+++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir
@@ -1116,11 +1116,8 @@ body:             |
     ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
     ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
     ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
-    ; CHECK-NEXT: renamable $vgpr0_vgpr1 = COPY killed renamable $agpr0_agpr1
     ; CHECK-NEXT: renamable $vgpr2_vgpr3 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
-    ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 killed $vgpr2_vgpr3, $vgpr2_vgpr3, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: renamable $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-    ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY renamable $vgpr0_vgpr1_vgpr2_vgpr3
+    ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_mac_e64 killed $vgpr2_vgpr3, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
     ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
     ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
@@ -1202,10 +1199,8 @@ body:             |
     ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
     ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
     ; CHECK-NEXT: renamable $vgpr0_vgpr1 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
-    ; CHECK-NEXT: renamable $vgpr18_vgpr19 = COPY killed renamable $agpr0_agpr1
-    ; CHECK-NEXT: early-clobber renamable $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X8F16_vgprcd_e64 killed $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: renamable $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17
-    ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY renamable $vgpr2_vgpr3_vgpr4_vgpr5
+    ; CHECK-NEXT: renamable $agpr16_agpr17 = COPY killed renamable $agpr0_agpr1
+    ; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 killed $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec
     ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
     ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
     ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
@@ -1957,3 +1952,89 @@ body:             |
     S_ENDPGM 0
 
 ...
+
+# Non-mac variant, src2 is the same VGPR, but a different subregister.
+---
+name:            inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  stackPtrOffsetReg: '$sgpr32'
+  occupancy:       10
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  ; CHECK-LABEL: name: inflate_result_to_agpr__V_MFMA_F32_32X32X8F16_vgprcd_e64_src2_different_subreg
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_NOP 0, implicit-def $agpr0
+  ; CHECK-NEXT:   renamable $sgpr0 = S_MOV_B32 0
+  ; CHECK-NEXT:   renamable $vgpr8 = V_MOV_B32_e32 0,...
[truncated]

; CHECK-NEXT: renamable $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed renamable $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17
; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY renamable $vgpr2_vgpr3_vgpr4_vgpr5
; CHECK-NEXT: renamable $agpr16_agpr17 = COPY killed renamable $agpr0_agpr1
; CHECK-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X8F16_e64 killed $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 0, 0, 0, implicit $mode, implicit $exec
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How did we end up changing V_MFMA_F32_32X32X8F16_vgprcd_e64 to V_MFMA_F32_32X32X8F16_e64?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This example is the whole point of doing this. Since the MFMA result isn't fully used, SplitKit inserted a copy bundle covering only the lanes that needed to be live, rather than a full copy. This change now treats those partial copies as starting points for the rewrite.

@arsenm arsenm force-pushed the users/arsenm/amdgpu/allow-subregister-copies-mfma-rewrite-candidates branch from 3956361 to 603f8df Compare August 12, 2025 01:30
@arsenm arsenm force-pushed the users/arsenm/amdgpu/add-tests-rewrite-mfma-agpr-with-subreg-copies branch 2 times, most recently from 3776f12 to 69cd540 Compare August 18, 2025 15:31
@arsenm arsenm force-pushed the users/arsenm/amdgpu/allow-subregister-copies-mfma-rewrite-candidates branch from 603f8df to d4de0b2 Compare August 18, 2025 15:31
@arsenm
Copy link
Contributor Author

arsenm commented Aug 19, 2025

ping

@arsenm arsenm requested review from cdevadas and jayfoad August 19, 2025 15:10
Copy link
Contributor

@perlfu perlfu left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM, with one minor question in comments

@arsenm arsenm force-pushed the users/arsenm/amdgpu/add-tests-rewrite-mfma-agpr-with-subreg-copies branch from 69cd540 to dd011f2 Compare August 20, 2025 23:23
@arsenm arsenm force-pushed the users/arsenm/amdgpu/allow-subregister-copies-mfma-rewrite-candidates branch from d4de0b2 to 2ad172e Compare August 20, 2025 23:23
@arsenm arsenm force-pushed the users/arsenm/amdgpu/add-tests-rewrite-mfma-agpr-with-subreg-copies branch from dd011f2 to 8b45c13 Compare August 21, 2025 00:11
@arsenm arsenm force-pushed the users/arsenm/amdgpu/allow-subregister-copies-mfma-rewrite-candidates branch from 2ad172e to 7619736 Compare August 21, 2025 00:11
Base automatically changed from users/arsenm/amdgpu/add-tests-rewrite-mfma-agpr-with-subreg-copies to main August 21, 2025 00:39
arsenm added 2 commits August 21, 2025 09:41
This should address the case where the result isn't fully used,
resulting in partial copy bundles from the MFMA result.
@arsenm arsenm force-pushed the users/arsenm/amdgpu/allow-subregister-copies-mfma-rewrite-candidates branch from 7619736 to 92a0a65 Compare August 21, 2025 00:42
@arsenm arsenm enabled auto-merge (squash) August 21, 2025 00:43
@arsenm arsenm merged commit eefad74 into main Aug 21, 2025
9 checks passed
@arsenm arsenm deleted the users/arsenm/amdgpu/allow-subregister-copies-mfma-rewrite-candidates branch August 21, 2025 01:17
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

5 participants